###載入套件及資料 ################################################

#Loading multiple .csv files as separate data frames
getwd()
[1] "G:/我的雲端硬碟/交換/MBA/1072/R/midterm"
folder <- "data/"
file_list <- list.files(path = folder, pattern = "*.csv")
#Read in each .csv file 
for (i in 1:length(file_list)){
  assign(file_list[i],
         read.csv(paste(folder,file_list[i],sep=''),stringsAsFactors = F)
  )}
#Rename data
geo <- olist_geolocation_dataset.csv
orders <- olist_orders_dataset.csv
cust <- olist_customers_dataset.csv
sellers <- olist_sellers_dataset.csv
products <- olist_products_dataset.csv
orderitems <- olist_order_items_dataset.csv
payments <- olist_order_payments_dataset.csv
nametrans <- product_category_name_translation.csv
reviews <- olist_order_reviews_dataset.csv
closed <- olist_closed_deals_dataset.csv
marketing <- olist_marketing_qualified_leads_dataset.csv
# 地理資料整理
geo$geolocation_lat<-round(geo$geolocation_lat,3)
geo$geolocation_lng<-round(geo$geolocation_lng,3)

selllocation<-geo %>% group_by(geolocation_city) %>% summarise(selllat = max(geolocation_lat),selllng=max(geolocation_lng))
custlocation<-geo %>% group_by(geolocation_city) %>% summarise(custlat = max(geolocation_lat),custlng=max(geolocation_lng))
# 時間資料處理
orders$order_approved_at<-as.Date(orders$order_approved_at,format="%Y-%m-%d %H:%M:%S")
orders$order_purchase_timestamp<-as.Date(orders$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_carrier_date<-as.Date(orders$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_customer_date<-as.Date(orders$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
orders$order_estimated_delivery_date<-as.Date(orders$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")

table(orders$order_status)

   approved    canceled     created   delivered    invoiced  processing 
          2         625           5       96478         314         301 
    shipped unavailable 
       1107         609 
orderitems$shipping_limit_date<-as.Date(orderitems$shipping_limit_date,format="%Y-%m-%d %H:%M:%S")
# 把各個資料合併
M_1 <- merge(orderitems,sellers,by.x="seller_id",by.y="seller_id")
M_2 <- merge(orders,cust,by.x="customer_id",by.y="customer_id")
M_3 <- merge(M_2,M_1,by="order_id")
M_4 <- merge(M_3,products,by="product_id")
M_5 <- merge(M_4,payments,by="order_id")
M_6 <- merge(M_5,selllocation,by.x="seller_city",by.y="geolocation_city")
M_7 <- merge(M_6,custlocation,by.x="customer_city",by.y="geolocation_city")

colnames(nametrans) <- c("product_category_name","product_category_name_english")
#計算買賣家之間的距離
dist_list <- list()

for (i in 1:nrow(M_7)) {
  
  dist_list[[i]] <- gdist(lon.1 = M_7$selllng[i], 
                          lat.1 = M_7$selllat[i], 
                          lon.2 = M_7$custlng[i], 
                          lat.2 = M_7$custlat[i], 
                          units="miles")
  
}
M_7$distbtwn<-as.integer(dist_list)
M_8<-merge(M_7,nametrans,by="product_category_name")
# 置入巴西地理資料
Brazil<-map_data("world") %>% filter(region=="Brazil")

#排除界外外的資料
M_8 = M_8[M_8$selllat <= 5.27438888,]
M_8 = M_8[M_8$custlat <= 5.27438888,]
#M_15_1 <- review of orderitems、M_15_2 <- review of sellers with order informatio
M_15_1<-merge(x = reviews , y = orderitems, by = "order_id")
M_15_2<-merge(x = M_15_1 , y = sellers, by = "seller_id")
M_15_3<-merge(x = M_15_1, y = closed, by = "seller_id")

Part1:基本概況

1-1.訂單數量
1-2.賣家與顧客的地理分布
#畫出賣家所在地
ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
  geom_point(data= M_8,aes(x=selllng,y=selllat,color=seller_state),size=0.2)

#畫出買家所在地
ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
  geom_point(data= M_8,aes(x=custlng,y=custlat,color=customer_state),size=0.2)

#各產品類別銷售情況及所在地
ggplot() +
  geom_bar(data= M_8,aes(product_category_name_english,fill=seller_state),width=1)+ coord_flip()

#賣家所在地數量
ggplot() +
  geom_bar(data= M_8,aes(seller_state,fill=seller_state),width=1)

#####1-3.危機:一次性顧客過多

#抓出一次性購買消費者特性
Q <- group_by(olist_customers_dataset.csv, customer_unique_id) %>% summarise(nid=n())
Q <- filter(Q, nid==1) 
M_13 <- merge(Q,cust, by="customer_unique_id")
M_14_1 <- merge(M_13, olist_orders_dataset.csv, by="customer_id", all=F)
M_14_2 <- merge(M_14_1,olist_order_items_dataset.csv, by="order_id", all=F)
M_14_3 <- merge(M_14_2, olist_products_dataset.csv, by="product_id", all=F)
colnames(product_category_name_translation.csv) <- c("product_category_name","product_category_name_english")
M_14_4 <- merge(M_14_3,product_category_name_translation.csv, by="product_category_name", all = F)
table(M_14_4$product_category_name_english) %>% sort() ##消費量最多為bed_beath

                  security_and_services               fashion_childrens_clothes 
                                      2                                       7 
                             la_cuisine                       cds_dvds_musicals 
                                     12                                      14 
                  arts_and_craftmanship                           fashion_sport 
                                     19                                      28 
                         home_comfort_2                     diapers_and_hygiene 
                                     28                                      29 
                                flowers       furniture_mattress_and_upholstery 
                                     33                                      34 
                                  music                  fashio_female_clothing 
                                     34                                      40 
                         party_supplies                          books_imported 
                                     42                                      57 
                           dvds_blu_ray                              cine_photo 
                                     60                                      68 
  small_appliances_home_oven_and_coffee                  tablets_printing_image 
                                     75                                      79 
                      furniture_bedroom                 costruction_tools_tools 
                                     94                                      99 
                  fashion_male_clothing                 fashion_underwear_beach 
                                    118                                     118 
                     christmas_supplies               construction_tools_safety 
                                    146                                     183 
                 signaling_and_security                               computers 
                                    187                                     197 
                                    art              agro_industry_and_commerce 
                                    201                                     203 
               costruction_tools_garden                       home_appliances_2 
                                    221                                     224 
                          fashion_shoes                         fixed_telephony 
                                    240                                     252 
                             food_drink          industry_commerce_and_business 
                                    255                                     256 
                        books_technical kitchen_dining_laundry_garden_furniture 
                                    261                                     263 
                       air_conditioning               construction_tools_lights 
                                    274                                     292 
                           market_place                                  drinks 
                                    297                                     335 
                                  audio                            home_confort 
                                    346                                     401 
                  furniture_living_room                                    food 
                                    455                                     466 
                 books_general_interest                       home_construction 
                                    529                                     559 
                        home_appliances                        small_appliances 
                                    633                                     653 
                    musical_instruments         construction_tools_construction 
                                    655                                     868 
                    luggage_accessories                          consoles_games 
                                   1040                                    1097 
                       office_furniture                fashion_bags_accessories 
                                   1610                                    1768 
                               pet_shop                              stationery 
                                   1826                                    2403 
                            electronics                                    baby 
                                   2676                                    2914 
                              perfumery                              cool_stuff 
                                   3231                                    3656 
                                   toys                                    auto 
                                   3901                                    4041 
                           garden_tools                               telephony 
                                   4097                                    4314 
                          watches_gifts                              housewares 
                                   5667                                    6527 
                  computers_accessories                         furniture_decor 
                                   7275                                    7513 
                         sports_leisure                           health_beauty 
                                   7954                                    9103 
                         bed_bath_table 
                                  10001 
onece <- group_by(M_14_4, product_category_name_english) %>% summarise(mean=mean(price), sum=sum(price))##平均消費額最多為computers與總消費最多為health_beauty

#####1-4.產品類別

#尋找出銷售最高的產品
#olist_products_dataset.csv : 32,951 consumers 
table(products$product_category_name) %>% sort(decreasing = TRUE) ##消費最多是 cama_mesa_banho 3029

                               cama_mesa_banho 
                                          3029 
                                 esporte_lazer 
                                          2867 
                              moveis_decoracao 
                                          2657 
                                  beleza_saude 
                                          2444 
                         utilidades_domesticas 
                                          2335 
                                    automotivo 
                                          1900 
                        informatica_acessorios 
                                          1639 
                                    brinquedos 
                                          1411 
                            relogios_presentes 
                                          1329 
                                     telefonia 
                                          1134 
                                         bebes 
                                           919 
                                    perfumaria 
                                           868 
                   fashion_bolsas_e_acessorios 
                                           849 
                                     papelaria 
                                           849 
                                    cool_stuff 
                                           789 
                            ferramentas_jardim 
                                           753 
                                      pet_shop 
                                           719 
                                               
                                           610 
                                   eletronicos 
                                           517 
             construcao_ferramentas_construcao 
                                           400 
                              eletrodomesticos 
                                           370 
                              malas_acessorios 
                                           349 
                                consoles_games 
                                           317 
                             moveis_escritorio 
                                           309 
                         instrumentos_musicais 
                                           289 
                               eletroportateis 
                                           231 
                               casa_construcao 
                                           225 
                        livros_interesse_geral 
                                           216 
                              fashion_calcados 
                                           173 
                                   moveis_sala 
                                           156 
                                  climatizacao 
                                           124 
                               livros_tecnicos 
                                           123 
                                telefonia_fixa 
                                           116 
                                 casa_conforto 
                                           111 
                             alimentos_bebidas 
                                           104 
                                  market_place 
                                           104 
                       fashion_roupa_masculina 
                                            95 
moveis_cozinha_area_de_servico_jantar_e_jardim 
                                            94 
                       sinalizacao_e_seguranca 
                                            93 
              construcao_ferramentas_seguranca 
                                            91 
                            eletrodomesticos_2 
                                            90 
                 construcao_ferramentas_jardim 
                                            88 
                                     alimentos 
                                            82 
                                       bebidas 
                                            81 
             construcao_ferramentas_iluminacao 
                                            78 
                     agro_industria_e_comercio 
                                            74 
                 industria_comercio_e_negocios 
                                            68 
                              artigos_de_natal 
                                            65 
                                         audio 
                                            58 
                                         artes 
                                            55 
                fashion_underwear_e_moda_praia 
                                            53 
                                  dvds_blu_ray 
                                            48 
                                 moveis_quarto 
                                            45 
            construcao_ferramentas_ferramentas 
                                            39 
                             livros_importados 
                                            31 
                   portateis_casa_forno_e_cafe 
                                            31 
                                           pcs 
                                            30 
                                     cine_foto 
                                            28 
                        fashion_roupa_feminina 
                                            27 
                                        musica 
                                            27 
                             artigos_de_festas 
                                            26 
                            artes_e_artesanato 
                                            19 
                               fashion_esporte 
                                            19 
                                        flores 
                                            14 
                               fraldas_higiene 
                                            12 
                                    la_cuisine 
                                            10 
                     moveis_colchao_e_estofado 
                                            10 
 portateis_cozinha_e_preparadores_de_alimentos 
                                            10 
                      tablets_impressao_imagem 
                                             9 
                               casa_conforto_2 
                                             5 
                 fashion_roupa_infanto_juvenil 
                                             5 
                                      pc_gamer 
                                             3 
                            seguros_e_servicos 
                                             2 
                             cds_dvds_musicais 
                                             1 
#互動式圓餅圖 : 不同種類商品的平均評分&其銷量佔比(商品種類後面的數字是其平均分數)
Product2 <- group_by(M_15_3,business_segment)%>%summarize(score2=round(mean(review_score),2),percent=n()/nrow(M_15_3)*100)

plot_ly(Product2, labels = paste(Product2$business_segment,Product2$score2),values = Product2$percent, type = 'pie') %>%
  layout(title = '各種類商品銷售百分比',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

#####1-5.不同時段下的銷售量

#整理消費者購買月份
ts = as.POSIXct(as.character(olist_orders_dataset.csv$order_purchase_timestamp) , format="%Y-%m-%d %T")

ts.bym <- cut(ts, breaks = "month")
dfts <- data.frame(ts,ts.bym)
table(dfts$ts.bym)

2016-09-01 2016-10-01 2016-11-01 2016-12-01 2017-01-01 2017-02-01 2017-03-01 
         4        324          0          1        800       1780       2682 
2017-04-01 2017-05-01 2017-06-01 2017-07-01 2017-08-01 2017-09-01 2017-10-01 
      2404       3700       3245       4026       4331       4285       4631 
2017-11-01 2017-12-01 2018-01-01 2018-02-01 2018-03-01 2018-04-01 2018-05-01 
      7544       5673       7269       6728       7211       6939       6873 
2018-06-01 2018-07-01 2018-08-01 2018-09-01 2018-10-01 
      6167       6292       6512         16          4 
ggplot(dfts, aes(ts.bym),las=1)+ geom_bar() ##最多銷售月份為2018-1-1

#每日尖峰時段
ts.byH <- format(ts,format="%H") %>% data.frame()
ggplot(ts.byH ,aes(.))+ geom_bar()

table(ts.byH)
ts.byH
  00   01   02   03   04   05   06   07   08   09   10   11   12   13   14   15 
2394 1170  510  272  206  188  502 1231 2967 4785 6177 6578 5995 6518 6569 6454 
  16   17   18   19   20   21   22   23 
6675 6150 5769 5982 6193 6217 5816 4123 
##商品種類-月份

M_10 <- merge(olist_orders_dataset.csv, olist_order_items_dataset.csv, by.x = "order_id")
M_11 <- merge(M_10, olist_products_dataset.csv, by.x = "product_id")
M_12 <- merge(M_11, product_category_name_translation.csv, by.x= "product_category_name", by.y="ï..product_category_name")

D <- select(M_12,product_category_name_english, order_purchase_timestamp)

DD = as.POSIXct(as.character(D$order_purchase_timestamp) , format="%Y-%m-%d %T")
DD.bym <- format(DD,format="%m")
D$bym <- DD.bym
E <- filter(D, product_category_name_english=="bed_bath_table"|product_category_name_english=="health_beauty"|product_category_name_english=="sports_leisure"|product_category_name_english=="furniture_decor"|product_category_name_english=="computers_accessories"|product_category_name_english=="housewares" )

summary(D$product_category_name_english) %>% sort()

##table(D$product_category_name_english, D$bym)
ggplot(E ,aes(product_category_name_english, fill=bym))+ geom_bar() ##抓出銷售量最高的六種商品做每個月分的比例圖

ggplot(D, aes(product_category_name_english, fill=bym, width=1))+ geom_bar()+coord_flip()  ##每種商品每個月分的銷售狀況

##按照小時區分
DD.byh <- format(DD,format="%H")
D$byh <- DD.byh
##table(D$product_category_name_english, D$byh)
ggplot(D, aes(product_category_name_english, fill=byh, width=1))+ geom_bar()+coord_flip()##按照時間點的銷售量

#按照小時區分heatmap版本
#Rearranging data from D data frame
deaf = select(D, product_category_name_english,byh)
shook = group_by(deaf, byh, product_category_name_english) %>% summarise(n=n())
jump = spread(shook, byh, n)
jump[is.na(jump)] <- 0 

#Converting product category column into rowname
jump = column_to_rownames(jump, var = "product_category_name_english")

jump = as.matrix(jump)
#Using plot_ly () to draw the interactive heatmap
plot_ly(x=colnames(jump), y=rownames(jump), z = jump, type = "heatmap")

Part2:每年變化趨勢分析

#####2-1.每月的訂單數,新進的買、賣家數

併入兩個Olist 行銷資料

seller_TOP100 <-
  M_8 %>% group_by(seller_id,product_category_name_english) %>% summarise(
  mount = sum(price),
  num = n()) %>% arrange(desc(num)) %>% head(100)

seller_TOP100_2 <- merge(seller_TOP100,closed[,c(1,2,6,8,9,12)],by="seller_id",all.x = T)

marketingdata <- merge(closed[,c(1,2,6,8,9,12)],marketing[,c(1,4)],by = "mql_id")
 
M_9 <-  merge(M_8,marketingdata,by = "seller_id",all.x = T)
#7 畫出Olist的成長概況:每月的訂單數、新進的買賣家數
str(M_9)
'data.frame':   113782 obs. of  45 variables:
 $ seller_id                    : chr  "0015a82c2db000af6aaaf3ae2ecb0532" "0015a82c2db000af6aaaf3ae2ecb0532" "0015a82c2db000af6aaaf3ae2ecb0532" "001cca7ae9ae17fb1caed9dfb1094831" ...
 $ product_category_name        : chr  "eletroportateis" "eletroportateis" "eletroportateis" "ferramentas_jardim" ...
 $ customer_city                : chr  "entre rios de minas" "paracatu" "umuarama" "concordia" ...
 $ seller_city                  : chr  "santo andre" "santo andre" "santo andre" "cariacica" ...
 $ order_id                     : chr  "7f39ba4c9052be115350065d07583cac" "9dc8d1a6f16f1b89874c29c9d8d30447" "d455a8cb295653b55abda06d434ab492" "226e16e7eb6dcefe180d80127468de8b" ...
 $ product_id                   : chr  "a2ff5a97bf95719e38ea2e3b4105bce8" "a2ff5a97bf95719e38ea2e3b4105bce8" "a2ff5a97bf95719e38ea2e3b4105bce8" "08574b074924071f4e201e151b152b4e" ...
 $ customer_id                  : chr  "d7fc82cbeafea77bd0a8fbbf6296e387" "d9442164acf4b03109425633efaa0cfc" "944b72539d7e1f7f7fc6e46639ef1fe3" "1063de5cc98f570e53eccb32723746e2" ...
 $ order_status                 : chr  "delivered" "delivered" "delivered" "delivered" ...
 $ order_purchase_timestamp     : Date, format: "2017-10-18" "2017-10-12" ...
 $ order_approved_at            : Date, format: "2017-10-18" "2017-10-12" ...
 $ order_delivered_carrier_date : Date, format: "2017-10-20" "2017-10-17" ...
 $ order_delivered_customer_date: Date, format: "2017-10-27" "2017-10-24" ...
 $ order_estimated_delivery_date: Date, format: "2017-11-09" "2017-11-06" ...
 $ customer_unique_id           : chr  "9de5797cddb92598755a0f76383ddbbb" "9915eb9f74b6c11aaf04833f65b00e93" "3c7e305796add66698959fc7ad176f6b" "20ae7f5cb408302fe02d95d78c8d4f01" ...
 $ customer_zip_code_prefix     : int  35490 38600 87502 89700 23954 8161 20780 31525 84045 13348 ...
 $ customer_state               : chr  "MG" "MG" "PR" "SC" ...
 $ order_item_id                : int  1 1 1 1 4 1 1 1 1 2 ...
 $ shipping_limit_date          : Date, format: "2017-10-24" "2017-10-18" ...
 $ price                        : num  895 895 895 99 99.9 99 99 110 99.5 89 ...
 $ freight_value                : num  21 21 21 46 14.9 ...
 $ seller_zip_code_prefix       : int  9080 9080 9080 29156 29156 29156 29156 29156 29156 29156 ...
 $ seller_state                 : chr  "SP" "SP" "SP" "ES" ...
 $ product_name_lenght          : int  40 40 40 36 34 36 36 39 33 36 ...
 $ product_description_lenght   : int  849 849 849 450 511 450 450 500 509 450 ...
 $ product_photos_qty           : int  2 2 2 1 4 1 1 1 1 1 ...
 $ product_weight_g             : int  11800 11800 11800 9000 8875 9000 9000 8325 9100 9000 ...
 $ product_length_cm            : int  40 40 40 42 40 42 42 43 42 42 ...
 $ product_height_cm            : int  43 43 43 12 14 12 12 14 13 12 ...
 $ product_width_cm             : int  36 36 36 39 43 39 39 33 39 39 ...
 $ payment_sequential           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ payment_type                 : chr  "credit_card" "credit_card" "credit_card" "credit_card" ...
 $ payment_installments         : int  8 4 10 1 4 4 1 2 1 6 ...
 $ payment_value                : num  916 916 916 290 574 ...
 $ selllat                      : num  -7.22 -7.22 -7.22 -20.13 -20.13 ...
 $ selllng                      : num  -36.6 -36.6 -36.6 -40.3 -40.3 ...
 $ custlat                      : num  -20.7 -17.1 -23.7 -27.2 -22.9 ...
 $ custlng                      : num  -44.1 -43.4 -53.2 -51.9 -44.2 ...
 $ distbtwn                     : int  1049 819 1580 880 313 449 250 245 709 482 ...
 $ product_category_name_english: chr  "small_appliances" "small_appliances" "small_appliances" "garden_tools" ...
 $ mql_id                       : chr  NA NA NA NA ...
 $ business_segment             : chr  NA NA NA NA ...
 $ lead_behaviour_profile       : chr  NA NA NA NA ...
 $ has_company                  : chr  NA NA NA NA ...
 $ business_type                : chr  NA NA NA NA ...
 $ origin                       : chr  NA NA NA NA ...
class(M_9$time)
[1] "NULL"
unique(M_9$time_group)
NULL
M_9[,9]=as.POSIXct(M_9[,9], format="%Y-%m-%d")
M_9$time_group <- format(M_9$order_purchase_timestamp,"%Y%m") %>% as.numeric()
M_9$time <- format(M_9$order_purchase_timestamp,"%Y%m") %>% paste0(.,"28") %>% as.Date("%Y%m%d")

              
time_group <- unique(M_9$time_group) %>% sort() 
num_seller_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
            function(i) setdiff(M_9$seller_id[M_9$time_group==time_group[i]],unique(M_9$seller_id[M_9$time_group<time_group[i]])) %>% length)
num_seller =  c(0,num_seller_1[-24])

num_customer_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
            function(i) setdiff(M_9$customer_unique_id[M_9$time_group==time_group[i]],unique(M_9$customer_unique_id[M_9$time_group<time_group[i]])) %>% length)
num_customer =  c(0,num_customer_1[-24])

num_order <- M_9 %>% group_by(time) %>% summarise(
  num_order = length(unique(order_id))
  )

plot <- cbind(time=(sort(unique(M_9$time))),num_seller,num_customer,num_order[,2])
#ggplotly_try 嘗試直接顯示4個軸度
plotly = ggplot(plot, mapping =aes(x=as.factor(time), y=num_order, col=num_customer)) +
  geom_point(mapping = aes(size=num_seller)) +
  geom_text(mapping =aes(label=time), size=0.5) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
  ylab("Order") +
  xlab("Time")
plotly

ggplotly(plotly)  

#####2-2.時間序列的折線圖:每月的訂單數,新進的買、賣家數

#畫出時間序列的折線圖
data.ts<-zoo(plot,plot[,"time"])
plot <- data.ts[1:24,-1]

dygraph(plot,main = "Olist新進買賣家及訂單數走勢") %>%
   dySeries("num_customer", label = "新進顧客數")%>%
   dySeries("num_order", label = "訂單數") %>%
   dySeries("num_seller", axis = 'y2', label = "新進賣家數") %>%
   dyOptions( axisLineColor="orange", 
    gridLineColor="indianred" , fillGraph = F,fillAlpha = 0.2,
    drawGrid  = TRUE,drawPoints=TRUE, pointSize = 1 ) %>% 
   dyAxis("x", label = " 日期 ",   drawGrid = F) %>% 
   dyAxis("y", label = " ", drawGrid = T) %>% 
   dyHighlight(highlightCircleSize = 3,              
    highlightSeriesBackgroundAlpha = 0.2) %>% 
   dyOptions(colors = RColorBrewer::brewer.pal(3, 'Dark2')) %>%
   dyRangeSelector(height = 1)

#####2-3.產品類別評分及銷售情形

#不同商品種類的平均評價分數排名 : air conditioning和handcrafted的平均評價最好
Product <- group_by(M_15_3,business_segment)%>%summarize(score=mean(review_score),quantity=n(),price=mean(price))
arrange(Product,score)
sort(table(M_15_3$business_segment))

                          party                  games_consoles 
                              3                               4 
               air_conditioning                           gifts 
                              5                               8 
                    handcrafted               music_instruments 
                             11                              25 
                           baby                      food_drink 
                             42                              49 
                   phone_mobile             fashion_accessories 
                             56                              60 
                     stationery                small_appliances 
                             60                              69 
          home_office_furniture                 food_supplement 
                             82                             100 
                          books                            toys 
                            106                             121 
                      computers                  bags_backpacks 
                            125                             140 
                home_appliances                 car_accessories 
                            141                             164 
                 sports_leisure                  bed_bath_table 
                            179                             188 
        audio_video_electronics                             pet 
                            275                             288 
construction_tools_house_garden                      home_decor 
                            319                             453 
            household_utilities                         watches 
                            562                             594 
                  health_beauty 
                            821 
#不同類別的平均評價 互動式散點圖
g=qplot(x=Product$score,                               
      y=Product$quantity,                              
      geom="point",  
      main = "不同商品類別的評價、銷量、價格",  
      xlab="平均分數",                          
      ylab="賣出數量",
      color= Product$business_segment,
      size = Product$price
      )
ggplotly(g)

#####2-4.泡泡圖:產品類別評分及銷售情形 -> 發現評分表現下滑

Part3:地區下的銷售/評分情況

#####3-1.州(state)銷售額 #####3-1.州(state)評分 #####3-1.城市(city)銷售額 #####3-1.城市(city)評分 ################以下孫嘉力部分

#score: the average score of each city
score <- M_15_2 %>%
group_by(seller_city) %>% 
  summarise(
  score = mean(review_score),
  pricesum = sum(price)+sum(freight_value)
  ) 
#geolocation: private dataframe, with state,lon and lat of each city
geolocation <- olist_geolocation_dataset.csv
geolocation['seller_city'] = geo['geolocation_city']
geolocation['state'] = geo["geolocation_state"]
geolocation['geolocation_zip_code_prefix'] = NULL
geolocation <- geo %>% 
group_by(geolocation_city) %>% 
summarise(
  lat = mean(geolocation_lat),
  lng = mean(geolocation_lng),
  seller_city = geolocation_city[1],
  state = geolocation_state[1]
  )

#draw review of each state and use addPolygon to fig
city_score <- merge(score,geolocation,by="seller_city")
state_score <- city_score %>%
group_by(state) %>%
summarise(
  score = mean(score),
  pricesum = sum(pricesum)
)

#import Brazill json data 
states <- geojsonio::geojson_read("C:\\Users\\User\\Desktop\\Group3\\midterm\\Brazil.json", what = "sp")

#color setting
bins_score <- c(0,1.0,2.0,3.0,4.0,5.0)
bins_money <- c(0,500,5000,10000,50000,100000,1000000,Inf)
pal_score <- colorBin("Blues", domain = state_score$score, bins = bins_score)
pal_money <- colorBin("Reds"  ,domain = state_score$pricesum, bins = bins_money)

#add score Info of State
labels <- sprintf(
  "<strong>%s</strong><br/> score: %g <br/> total price: %g ",
  state_score$state, 
  state_score$score,
  state_score$pricesum
) %>% lapply(htmltools::HTML)

citylabels <- sprintf(
  "<strong>city: %s</strong><br/> score: %g <br/> total price: %g ",
  city_score$seller_city, 
  city_score$score,
  city_score$pricesum
)%>% lapply(htmltools::HTML)

#form a map of Brazil 
map_Brazil <- leaflet(states) %>%
  setView(lat=-22.074022, lng=-48.74026, zoom = 4)  %>%
  addProviderTiles("MapBox",group = "map", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) 

#draw city inform
map_Brazil <- map_Brazil %>%  
  addProviderTiles("MapBox",group = "city", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>% 
  addMarkers(
    lng     = city_score$lng,
    lat     = city_score$lat,
    label   = citylabels,
    labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 4px"),
    textsize = "15px",
    direction = "auto"),
    clusterOptions = markerClusterOptions(),
    options = popupOptions(closeButton = TRUE),
    group = "city"
  )      

#draw state labels

map_Brazil <- map_Brazil %>% 
#add state score information 
addPolygons(
  fillColor = ~pal_score(state_score$score),
  weight = 2,
  opacity = 1,
  color = "white",
  dashArray = "3",
  fillOpacity = 0.7,
  highlight = highlightOptions(
    weight = 5,
    color = "#666",
    dashArray = "",
    fillOpacity = 0.7,
    bringToFront = TRUE),
  label = labels,
  labelOptions = labelOptions(
    style = list
    ("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto"),
  group = "state_score")  %>%
#add state price information 
  addPolygons(
  fillColor = ~pal_money(state_score$pricesum),
  weight = 2,
  opacity = 1,
  color = "white",
  dashArray = "3",
  fillOpacity = 0.7,
  highlight = highlightOptions(
    weight = 5,
    color = "#666",
    dashArray = "",
    fillOpacity = 0.7,
    bringToFront = TRUE),
  label = labels,
  labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto"),
  group = "state_price"
  ) %>% 
#add legends 
  addLegend(pal = pal_money, values = ~state_score$pricesum ,opacity =0.7,title = "price legend",
    position = "bottomleft",group = "state_price" ) %>%
  addLegend(pal = pal_score, values = ~state_score$score, opacity = 0.7, title = "score legend",
    position = "bottomleft",group = "state_score") %>% 
# add layer Control 
  addLayersControl(
    overlayGroups  = c("city","state_score","state_price") 
  ) %>% hideGroup("state_price")  %>%
# add mini map
  addProviderTiles(providers$Esri.WorldStreetMap) %>%
  addMiniMap(
    tiles = providers$Esri.WorldStreetMap,
    toggleDisplay = TRUE)

#結果

map_Brazil

Part4:評分迴歸模型

#####4-1.解釋新變數

#8 找對評分的重要變數
#####王
M_9 %>% colnames() #2:4,9:11,15,17,19:20,22,24,25,32,38
 [1] "seller_id"                     "product_category_name"        
 [3] "customer_city"                 "seller_city"                  
 [5] "order_id"                      "product_id"                   
 [7] "customer_id"                   "order_status"                 
 [9] "order_purchase_timestamp"      "order_approved_at"            
[11] "order_delivered_carrier_date"  "order_delivered_customer_date"
[13] "order_estimated_delivery_date" "customer_unique_id"           
[15] "customer_zip_code_prefix"      "customer_state"               
[17] "order_item_id"                 "shipping_limit_date"          
[19] "price"                         "freight_value"                
[21] "seller_zip_code_prefix"        "seller_state"                 
[23] "product_name_lenght"           "product_description_lenght"   
[25] "product_photos_qty"            "product_weight_g"             
[27] "product_length_cm"             "product_height_cm"            
[29] "product_width_cm"              "payment_sequential"           
[31] "payment_type"                  "payment_installments"         
[33] "payment_value"                 "selllat"                      
[35] "selllng"                       "custlat"                      
[37] "custlng"                       "distbtwn"                     
[39] "product_category_name_english" "mql_id"                       
[41] "business_segment"              "lead_behaviour_profile"       
[43] "has_company"                   "business_type"                
[45] "origin"                        "time_group"                   
[47] "time"                         
score <- M_9[,c(5,10:12,16,17,19,20,22,24:29,32,38,46)] %>% group_by(order_id) %>% summarise(
  ship13 = mean(order_delivered_customer_date-order_approved_at),
  ship12 = mean(order_delivered_carrier_date-order_approved_at),
  ship23 = mean(order_delivered_customer_date-order_delivered_carrier_date),
  ship_ratio = mean(freight_value/price),
  price = sum(price),
  AVEvolume = sum(product_length_cm*product_height_cm*product_width_cm)/max(order_item_id),
  description = sum(product_description_lenght)/max(order_item_id),
  photo_num = sum(product_photos_qty)/max(order_item_id),
  pay_installment = sum(payment_installments)/max(order_item_id),
  dist = sum(distbtwn)/max(order_item_id),
  customer_state = customer_state[1],
  seller_state = seller_state[1],
  time_group = time_group[1]
)  



reviews2 <- reviews %>% group_by(order_id) %>% summarise(
  score = mean(review_score)
)
# 建置對評分的回歸模型
score_lm<-merge(score,reviews2,by="order_id")
score_lm[,2:4] <- sapply(score_lm[,2:4], as.numeric)
score_lm <- score_lm[complete.cases(score_lm),]
#回歸結果
model <- lm(score ~ . ,data = score_lm[,c(2:15)])
summary(model)

Call:
lm(formula = score ~ ., data = score_lm[, c(2:15)])

Residuals:
   Min     1Q Median     3Q    Max 
-4.142 -0.416  0.511  0.758  8.714 

Coefficients: (1 not defined because of singularities)
                      Estimate    Std. Error t value             Pr(>|t|)    
(Intercept)      146.676094324  16.673163569    8.80 < 0.0000000000000002 ***
ship13            -0.049347295   0.000507651  -97.21 < 0.0000000000000002 ***
ship12            -0.005992261   0.001248453   -4.80    0.000001591111096 ***
ship23                      NA            NA      NA                   NA    
ship_ratio        -0.102628272   0.013671220   -7.51    0.000000000000061 ***
price             -0.000180322   0.000019886   -9.07 < 0.0000000000000002 ***
AVEvolume          0.000000596   0.000000145    4.12    0.000038470316906 ***
description        0.000017458   0.000005592    3.12              0.00180 ** 
photo_num          0.000393133   0.002037934    0.19              0.84703    
pay_installment   -0.005247201   0.001555012   -3.37              0.00074 ***
dist               0.000028964   0.000010916    2.65              0.00797 ** 
customer_stateAL  -0.030491776   0.151932969   -0.20              0.84094    
customer_stateAM   0.479224704   0.172213086    2.78              0.00539 ** 
customer_stateAP   0.478096075   0.203018981    2.35              0.01853 *  
customer_stateBA  -0.200615374   0.140677011   -1.43              0.15385    
customer_stateCE  -0.082217496   0.143113136   -0.57              0.56564    
customer_stateDF  -0.312117403   0.141656775   -2.20              0.02757 *  
customer_stateES  -0.228977322   0.141859044   -1.61              0.10651    
customer_stateGO  -0.211966355   0.141937848   -1.49              0.13534    
customer_stateMA  -0.183800394   0.146374141   -1.26              0.20923    
customer_stateMG  -0.301321084   0.139791712   -2.16              0.03113 *  
customer_stateMS  -0.139833637   0.146508268   -0.95              0.33986    
customer_stateMT  -0.047076882   0.145234659   -0.32              0.74583    
customer_statePA  -0.005394138   0.144658238   -0.04              0.97025    
customer_statePB  -0.016210304   0.149039725   -0.11              0.91339    
customer_statePE  -0.089017302   0.142290724   -0.63              0.53158    
customer_statePI  -0.108850982   0.149998715   -0.73              0.46804    
customer_statePR  -0.255461151   0.140426033   -1.82              0.06889 .  
customer_stateRJ  -0.367436846   0.139746382   -2.63              0.00856 ** 
customer_stateRN   0.024686391   0.150197592    0.16              0.86945    
customer_stateRO   0.073445526   0.160002503    0.46              0.64622    
customer_stateRR   0.344487911   0.240709613    1.43              0.15239    
customer_stateRS  -0.154763067   0.140198252   -1.10              0.26964    
customer_stateSC  -0.216784107   0.140776966   -1.54              0.12359    
customer_stateSE  -0.114351276   0.154268955   -0.74              0.45855    
customer_stateSP  -0.390899799   0.139637944   -2.80              0.00512 ** 
customer_stateTO  -0.032079331   0.157522517   -0.20              0.83863    
seller_stateBA     0.310311297   0.701754252    0.44              0.65835    
seller_stateCE     0.548047705   0.712232351    0.77              0.44161    
seller_stateDF     0.154739715   0.701112259    0.22              0.82532    
seller_stateES     0.215570007   0.703302483    0.31              0.75922    
seller_stateGO     0.440912837   0.702134681    0.63              0.53003    
seller_stateMA     0.308390531   0.702456053    0.44              0.66065    
seller_stateMG     0.303042716   0.699923113    0.43              0.66504    
seller_stateMS     0.479169855   0.722690379    0.66              0.50731    
seller_stateMT     0.349251758   0.707548811    0.49              0.62158    
seller_statePA     0.667659362   0.820354156    0.81              0.41572    
seller_statePB     0.269953993   0.733877879    0.37              0.71299    
seller_statePE     0.293853538   0.702610819    0.42              0.67578    
seller_statePI     0.478320207   0.789359648    0.61              0.54454    
seller_statePR     0.304523535   0.699920481    0.44              0.66350    
seller_stateRJ     0.288714064   0.700050066    0.41              0.68003    
seller_stateRN     0.183959834   0.720481757    0.26              0.79847    
seller_stateRO     0.048482815   0.856778615    0.06              0.95487    
seller_stateRS     0.320738178   0.700340490    0.46              0.64697    
seller_stateSC     0.310799400   0.700071270    0.44              0.65708    
seller_stateSE    -0.364078942   0.820432032   -0.44              0.65721    
seller_stateSP     0.182444172   0.699815389    0.26              0.79432    
time_group        -0.000702833   0.000082589   -8.51 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.21 on 93393 degrees of freedom
Multiple R-squared:  0.125, Adjusted R-squared:  0.124 
F-statistic:  233 on 57 and 93393 DF,  p-value: <0.0000000000000002

#####4-2.相關係數

#相關係數視覺化
# Libraries
library(GGally)

# Check correlation between variables
cor(score_lm[,c(2:11,14,15)]) 
                    ship13     ship12     ship23 ship_ratio      price
ship13           1.0000000  0.3864676  0.9283265  0.0766294  0.0481980
ship12           0.3864676  1.0000000  0.0158873 -0.0212227  0.0668187
ship23           0.9283265  0.0158873  1.0000000  0.0916289  0.0253181
ship_ratio       0.0766294 -0.0212227  0.0916289  1.0000000 -0.2723965
price            0.0481980  0.0668187  0.0253181 -0.2723965  1.0000000
AVEvolume        0.0621467  0.1181375  0.0197542 -0.0806547  0.3094009
description      0.0050792  0.0057164  0.0032022 -0.1140959  0.2676434
photo_num       -0.0224812 -0.0352931 -0.0101459 -0.0470165  0.1166358
pay_installment  0.0742081  0.0578195  0.0571433 -0.1857385  0.3161935
dist             0.2670480  0.0080955  0.2862451  0.1404717  0.1125701
time_group      -0.0597468 -0.0806666 -0.0322563  0.0093886 -0.0040814
score           -0.3361920 -0.1544938 -0.3021936 -0.0270974 -0.0311427
                AVEvolume description photo_num pay_installment       dist
ship13           0.062147   0.0050792 -0.022481        0.074208  0.2670480
ship12           0.118138   0.0057164 -0.035293        0.057819  0.0080955
ship23           0.019754   0.0032022 -0.010146        0.057143  0.2862451
ship_ratio      -0.080655  -0.1140959 -0.047017       -0.185738  0.1404717
price            0.309401   0.2676434  0.116636        0.316194  0.1125701
AVEvolume        1.000000   0.1158442  0.109449        0.171008  0.0597506
description      0.115844   1.0000000  0.223843        0.074115  0.1392289
photo_num        0.109449   0.2238430  1.000000        0.037662  0.1102882
pay_installment  0.171008   0.0741153  0.037662        1.000000  0.0874214
dist             0.059751   0.1392289  0.110288        0.087421  1.0000000
time_group      -0.039378   0.0221935 -0.013546       -0.060023 -0.0359676
score           -0.018784   0.0111701  0.012524       -0.030453 -0.0380816
                time_group      score
ship13          -0.0597468 -0.3361920
ship12          -0.0806666 -0.1544938
ship23          -0.0322563 -0.3021936
ship_ratio       0.0093886 -0.0270974
price           -0.0040814 -0.0311427
AVEvolume       -0.0393776 -0.0187840
description      0.0221935  0.0111701
photo_num       -0.0135455  0.0125242
pay_installment -0.0600233 -0.0304530
dist            -0.0359676 -0.0380816
time_group       1.0000000 -0.0078071
score           -0.0078071  1.0000000
# Check correlations (as scatterplots), distribution and print corrleation coefficient 
#ggpairs(score_lm[,c(2:11,14,15)]) 
 
# Nice visualization of correlations
ggcorr(score_lm[,c(2:11,14,15)], method = c("everything", "pearson")) 

其他

找出前100大賣家

seller_TOP100 <-
  M_8 %>% group_by(seller_id,product_category_name_english) %>% summarise(
  mount = sum(price),
  num = n()) %>% arrange(desc(num)) %>% head(100)

seller_TOP100_2 <- merge(seller_TOP100,closed[,c(1,2,6,8,9,12)],by="seller_id",all.x = T)

marketingdata <- merge(closed[,c(1,2,6,8,9,12)],marketing[,c(1,4)],by = "mql_id")
 
M_9 <-  merge(M_8,marketingdata,by = "seller_id",all.x = T)

#賣家的性格分類

#分析
A_1 <- M_9 %>% group_by(lead_behaviour_profile,business_segment) %>% summarise(
  n_business = n(),
  mount = sum(price),
  every_mount = mount/n_business
  )  %>% arrange(desc(lead_behaviour_profile))
   
table(A_1$lead_behaviour_profile) 

                cat cat, wolf     eagle     shark      wolf 
       23        26         1        22         7        15 
ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
  geom_point(data= M_9,aes(x=custlng,y=custlat,color=lead_behaviour_profile),size=0.2)

ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
  geom_point(data= M_9[M_9$lead_behaviour_profile=="wolf",],aes(x=custlng,y=custlat,color=lead_behaviour_profile),size=0.2)
Warning: Removed 108718 rows containing missing values (geom_point).

#貨物與體積的關係

v <- mutate(products, product_volume= product_length_cm*product_height_cm*product_width_cm)
ggplot(v, aes(product_photos_qty, product_name_lenght))+ geom_point()
Warning: Removed 610 rows containing missing values (geom_point).

ggplot(v, aes(product_volume))+ geom_bar()
Warning: Removed 2 rows containing non-finite values (stat_count).

table(v$product_photos_qty) ##1 photo

    1     2     3     4     5     6     7     8     9    10    11    12    13 
16489  6263  3860  2428  1484   968   343   192   105    95    46    35     9 
   14    15    17    18    19    20 
    5     8     7     2     1     1 
table(v$product_category_name, v$product_photos_qty)
                                                
                                                    1    2    3    4    5    6
                                                    0    0    0    0    0    0
  agro_industria_e_comercio                        41   10    7    4    2    8
  alimentos                                        56    6    7    3    4    4
  alimentos_bebidas                                66   21    5    4    0    2
  artes                                            32   12    3    0    1    3
  artes_e_artesanato                                6    2    4    2    4    0
  artigos_de_festas                                19    4    0    0    0    2
  artigos_de_natal                                 40    4    5    3    2    1
  audio                                            31   10    7    6    2    2
  automotivo                                      747  320  300  165  196  145
  bebes                                           382  225  123   99   35   31
  bebidas                                          60    8    9    1    0    1
  beleza_saude                                   1732  315  176  114   60   25
  brinquedos                                      537  358  212  137   81   42
  cama_mesa_banho                                2283  507  128   57   27   21
  casa_conforto                                    49   17   43    0    2    0
  casa_conforto_2                                   4    1    0    0    0    0
  casa_construcao                                  70   46   13   32   14   21
  cds_dvds_musicais                                 0    1    0    0    0    0
  cine_foto                                        14    6    4    0    3    0
  climatizacao                                     67   19    7   16   12    2
  consoles_games                                  138   68   46   36   19    4
  construcao_ferramentas_construcao               202   66   41   30   18   31
  construcao_ferramentas_ferramentas               18   11    6    0    1    2
  construcao_ferramentas_iluminacao                44   12    7    7    2    4
  construcao_ferramentas_jardim                    58    6    1    2   11   10
  construcao_ferramentas_seguranca                 40   17   17   11    4    2
  cool_stuff                                      358  166  110   81   33   21
  dvds_blu_ray                                     40    3    2    1    0    2
  eletrodomesticos                                259   52   41    9    2    0
  eletrodomesticos_2                               42   18   13    7    2    3
  eletronicos                                     282   92   84   28   18    7
  eletroportateis                                 110   46   33   22   10    7
  esporte_lazer                                  1576  492  269  214  158   79
  fashion_bolsas_e_acessorios                     299  115  118  131  110   48
  fashion_calcados                                  3   15   20   75   19   15
  fashion_esporte                                   3   13    0    1    0    2
  fashion_roupa_feminina                            6   13    1    4    0    3
  fashion_roupa_infanto_juvenil                     1    0    2    1    1    0
  fashion_roupa_masculina                          26   57    7    4    0    1
  fashion_underwear_e_moda_praia                   14   12   15    1    2    4
  ferramentas_jardim                              339  140   99   76   44   26
  flores                                           12    1    1    0    0    0
  fraldas_higiene                                   7    2    1    1    0    0
  industria_comercio_e_negocios                    36   12    9    6    3    1
  informatica_acessorios                          848  342  231  120   58   25
  instrumentos_musicais                           113   52   39   23   29   12
  la_cuisine                                        9    1    0    0    0    0
  livros_importados                                26    3    0    0    0    1
  livros_interesse_geral                          155   22   11    8    8    7
  livros_tecnicos                                 117    4    0    1    0    1
  malas_acessorios                                152   54   49   29   24   14
  market_place                                     38   20   20   13    5    4
  moveis_colchao_e_estofado                         4    4    0    2    0    0
  moveis_cozinha_area_de_servico_jantar_e_jardim   33   20    9   11   13    2
  moveis_decoracao                               1119  650  403  209   98   50
  moveis_escritorio                               260   27    9    4    1    7
  moveis_quarto                                    19   16    4    3    1    2
  moveis_sala                                      63   56   20   10    6    1
  musica                                           10    8    6    2    1    0
  papelaria                                       295  182  157   90   53   26
  pc_gamer                                          0    0    1    1    0    1
  pcs                                              12    5    0    9    2    1
  perfumaria                                      587  172   57   25   10    9
  pet_shop                                        298  179  103   51   28   28
  portateis_casa_forno_e_cafe                      18    2    4    2    1    2
  portateis_cozinha_e_preparadores_de_alimentos     6    1    0    2    1    0
  relogios_presentes                              541  370  205  101   55   28
  seguros_e_servicos                                1    0    0    1    0    0
  sinalizacao_e_seguranca                          39   21   14    6    5    1
  tablets_impressao_imagem                          4    0    2    0    2    0
  telefonia                                       368  301  176   95   49  110
  telefonia_fixa                                   28   27   32   12    8    1
  utilidades_domesticas                          1177  403  312  207  124   53
                                                
                                                    7    8    9   10   11   12
                                                    0    0    0    0    0    0
  agro_industria_e_comercio                         1    1    0    0    0    0
  alimentos                                         2    0    0    0    0    0
  alimentos_bebidas                                 4    1    0    1    0    0
  artes                                             3    0    0    1    0    0
  artes_e_artesanato                                1    0    0    0    0    0
  artigos_de_festas                                 1    0    0    0    0    0
  artigos_de_natal                                  1    3    6    0    0    0
  audio                                             0    0    0    0    0    0
  automotivo                                       11   10    4    2    0    0
  bebes                                            13    1    4    5    0    0
  bebidas                                           0    0    2    0    0    0
  beleza_saude                                     10    7    2    3    0    0
  brinquedos                                       17   10    7    7    1    1
  cama_mesa_banho                                   1    4    1    0    0    0
  casa_conforto                                     0    0    0    0    0    0
  casa_conforto_2                                   0    0    0    0    0    0
  casa_construcao                                  22    7    0    0    0    0
  cds_dvds_musicais                                 0    0    0    0    0    0
  cine_foto                                         1    0    0    0    0    0
  climatizacao                                      1    0    0    0    0    0
  consoles_games                                    2    2    2    0    0    0
  construcao_ferramentas_construcao                 6    4    1    1    0    0
  construcao_ferramentas_ferramentas                0    1    0    0    0    0
  construcao_ferramentas_iluminacao                 2    0    0    0    0    0
  construcao_ferramentas_jardim                     0    0    0    0    0    0
  construcao_ferramentas_seguranca                  0    0    0    0    0    0
  cool_stuff                                       11    3    1    4    0    0
  dvds_blu_ray                                      0    0    0    0    0    0
  eletrodomesticos                                  1    1    0    5    0    0
  eletrodomesticos_2                                4    0    1    0    0    0
  eletronicos                                       5    0    0    1    0    0
  eletroportateis                                   2    0    0    0    0    0
  esporte_lazer                                    27   34    7   10    1    0
  fashion_bolsas_e_acessorios                      17    4    0    0    0    0
  fashion_calcados                                  0    1    0    0    0   25
  fashion_esporte                                   0    0    0    0    0    0
  fashion_roupa_feminina                            0    0    0    0    0    0
  fashion_roupa_infanto_juvenil                     0    0    0    0    0    0
  fashion_roupa_masculina                           0    0    0    0    0    0
  fashion_underwear_e_moda_praia                    0    0    2    0    2    1
  ferramentas_jardim                               15    6    5    2    1    0
  flores                                            0    0    0    0    0    0
  fraldas_higiene                                   0    1    0    0    0    0
  industria_comercio_e_negocios                     1    0    0    0    0    0
  informatica_acessorios                            8    6    1    0    0    0
  instrumentos_musicais                             3    7    7    4    0    0
  la_cuisine                                        0    0    0    0    0    0
  livros_importados                                 1    0    0    0    0    0
  livros_interesse_geral                            2    1    1    1    0    0
  livros_tecnicos                                   0    0    0    0    0    0
  malas_acessorios                                 10    6    5    6    0    0
  market_place                                      0    1    3    0    0    0
  moveis_colchao_e_estofado                         0    0    0    0    0    0
  moveis_cozinha_area_de_servico_jantar_e_jardim    4    2    0    0    0    0
  moveis_decoracao                                 42   24   19    9   33    0
  moveis_escritorio                                 1    0    0    0    0    0
  moveis_quarto                                     0    0    0    0    0    0
  moveis_sala                                       0    0    0    0    0    0
  musica                                            0    0    0    0    0    0
  papelaria                                        19    8    7   12    0    0
  pc_gamer                                          0    0    0    0    0    0
  pcs                                               1    0    0    0    0    0
  perfumaria                                        6    1    0    0    0    0
  pet_shop                                          7    5    1    4    0    2
  portateis_casa_forno_e_cafe                       1    1    0    0    0    0
  portateis_cozinha_e_preparadores_de_alimentos     0    0    0    0    0    0
  relogios_presentes                               13   10    5    1    0    0
  seguros_e_servicos                                0    0    0    0    0    0
  sinalizacao_e_seguranca                           4    2    1    0    0    0
  tablets_impressao_imagem                          1    0    0    0    0    0
  telefonia                                         8    4    3    5    5    4
  telefonia_fixa                                    0    2    3    1    2    0
  utilidades_domesticas                            30   11    4   10    1    2
                                                
                                                   13   14   15   17   18   19
                                                    0    0    0    0    0    0
  agro_industria_e_comercio                         0    0    0    0    0    0
  alimentos                                         0    0    0    0    0    0
  alimentos_bebidas                                 0    0    0    0    0    0
  artes                                             0    0    0    0    0    0
  artes_e_artesanato                                0    0    0    0    0    0
  artigos_de_festas                                 0    0    0    0    0    0
  artigos_de_natal                                  0    0    0    0    0    0
  audio                                             0    0    0    0    0    0
  automotivo                                        0    0    0    0    0    0
  bebes                                             0    0    0    0    0    1
  bebidas                                           0    0    0    0    0    0
  beleza_saude                                      0    0    0    0    0    0
  brinquedos                                        0    0    0    0    0    0
  cama_mesa_banho                                   0    0    0    0    0    0
  casa_conforto                                     0    0    0    0    0    0
  casa_conforto_2                                   0    0    0    0    0    0
  casa_construcao                                   0    0    0    0    0    0
  cds_dvds_musicais                                 0    0    0    0    0    0
  cine_foto                                         0    0    0    0    0    0
  climatizacao                                      0    0    0    0    0    0
  consoles_games                                    0    0    0    0    0    0
  construcao_ferramentas_construcao                 0    0    0    0    0    0
  construcao_ferramentas_ferramentas                0    0    0    0    0    0
  construcao_ferramentas_iluminacao                 0    0    0    0    0    0
  construcao_ferramentas_jardim                     0    0    0    0    0    0
  construcao_ferramentas_seguranca                  0    0    0    0    0    0
  cool_stuff                                        1    0    0    0    0    0
  dvds_blu_ray                                      0    0    0    0    0    0
  eletrodomesticos                                  0    0    0    0    0    0
  eletrodomesticos_2                                0    0    0    0    0    0
  eletronicos                                       0    0    0    0    0    0
  eletroportateis                                   0    1    0    0    0    0
  esporte_lazer                                     0    0    0    0    0    0
  fashion_bolsas_e_acessorios                       1    2    4    0    0    0
  fashion_calcados                                  0    0    0    0    0    0
  fashion_esporte                                   0    0    0    0    0    0
  fashion_roupa_feminina                            0    0    0    0    0    0
  fashion_roupa_infanto_juvenil                     0    0    0    0    0    0
  fashion_roupa_masculina                           0    0    0    0    0    0
  fashion_underwear_e_moda_praia                    0    0    0    0    0    0
  ferramentas_jardim                                0    0    0    0    0    0
  flores                                            0    0    0    0    0    0
  fraldas_higiene                                   0    0    0    0    0    0
  industria_comercio_e_negocios                     0    0    0    0    0    0
  informatica_acessorios                            0    0    0    0    0    0
  instrumentos_musicais                             0    0    0    0    0    0
  la_cuisine                                        0    0    0    0    0    0
  livros_importados                                 0    0    0    0    0    0
  livros_interesse_geral                            0    0    0    0    0    0
  livros_tecnicos                                   0    0    0    0    0    0
  malas_acessorios                                  0    0    0    0    0    0
  market_place                                      0    0    0    0    0    0
  moveis_colchao_e_estofado                         0    0    0    0    0    0
  moveis_cozinha_area_de_servico_jantar_e_jardim    0    0    0    0    0    0
  moveis_decoracao                                  0    0    1    0    0    0
  moveis_escritorio                                 0    0    0    0    0    0
  moveis_quarto                                     0    0    0    0    0    0
  moveis_sala                                       0    0    0    0    0    0
  musica                                            0    0    0    0    0    0
  papelaria                                         0    0    0    0    0    0
  pc_gamer                                          0    0    0    0    0    0
  pcs                                               0    0    0    0    0    0
  perfumaria                                        0    1    0    0    0    0
  pet_shop                                          0    1    3    7    2    0
  portateis_casa_forno_e_cafe                       0    0    0    0    0    0
  portateis_cozinha_e_preparadores_de_alimentos     0    0    0    0    0    0
  relogios_presentes                                0    0    0    0    0    0
  seguros_e_servicos                                0    0    0    0    0    0
  sinalizacao_e_seguranca                           0    0    0    0    0    0
  tablets_impressao_imagem                          0    0    0    0    0    0
  telefonia                                         6    0    0    0    0    0
  telefonia_fixa                                    0    0    0    0    0    0
  utilidades_domesticas                             1    0    0    0    0    0
                                                
                                                   20
                                                    0
  agro_industria_e_comercio                         0
  alimentos                                         0
  alimentos_bebidas                                 0
  artes                                             0
  artes_e_artesanato                                0
  artigos_de_festas                                 0
  artigos_de_natal                                  0
  audio                                             0
  automotivo                                        0
  bebes                                             0
  bebidas                                           0
  beleza_saude                                      0
  brinquedos                                        1
  cama_mesa_banho                                   0
  casa_conforto                                     0
  casa_conforto_2                                   0
  casa_construcao                                   0
  cds_dvds_musicais                                 0
  cine_foto                                         0
  climatizacao                                      0
  consoles_games                                    0
  construcao_ferramentas_construcao                 0
  construcao_ferramentas_ferramentas                0
  construcao_ferramentas_iluminacao                 0
  construcao_ferramentas_jardim                     0
  construcao_ferramentas_seguranca                  0
  cool_stuff                                        0
  dvds_blu_ray                                      0
  eletrodomesticos                                  0
  eletrodomesticos_2                                0
  eletronicos                                       0
  eletroportateis                                   0
  esporte_lazer                                     0
  fashion_bolsas_e_acessorios                       0
  fashion_calcados                                  0
  fashion_esporte                                   0
  fashion_roupa_feminina                            0
  fashion_roupa_infanto_juvenil                     0
  fashion_roupa_masculina                           0
  fashion_underwear_e_moda_praia                    0
  ferramentas_jardim                                0
  flores                                            0
  fraldas_higiene                                   0
  industria_comercio_e_negocios                     0
  informatica_acessorios                            0
  instrumentos_musicais                             0
  la_cuisine                                        0
  livros_importados                                 0
  livros_interesse_geral                            0
  livros_tecnicos                                   0
  malas_acessorios                                  0
  market_place                                      0
  moveis_colchao_e_estofado                         0
  moveis_cozinha_area_de_servico_jantar_e_jardim    0
  moveis_decoracao                                  0
  moveis_escritorio                                 0
  moveis_quarto                                     0
  moveis_sala                                       0
  musica                                            0
  papelaria                                         0
  pc_gamer                                          0
  pcs                                               0
  perfumaria                                        0
  pet_shop                                          0
  portateis_casa_forno_e_cafe                       0
  portateis_cozinha_e_preparadores_de_alimentos     0
  relogios_presentes                                0
  seguros_e_servicos                                0
  sinalizacao_e_seguranca                           0
  tablets_impressao_imagem                          0
  telefonia                                         0
  telefonia_fixa                                    0
  utilidades_domesticas                             0
ggplot(v,aes(product_name_lenght))+ geom_bar()  ##左尾分布 集中在接近60
Warning: Removed 610 rows containing non-finite values (stat_count).

ggplot(v,aes(product_description_lenght))+ geom_bar() ##右尾分布 集中在0~1000
Warning: Removed 610 rows containing non-finite values (stat_count).

ggplot(v,aes(product_volume))+ geom_histogram(binwidth = 1500)
Warning: Removed 2 rows containing non-finite values (stat_bin).

by_name <- group_by(v,product_category_name) %>% summarise(avg = mean(product_volume)) %>% as.data.frame() ##貨物平均體積
by_name <- by_name[-1,]
product_category_name_translation.csv$product_category_name<- product_category_name_translation.csv$product_category_name

m <- merge(by_name,product_category_name_translation.csv,all = F)
m <- m[,-3]  ##貨物種類-平均體積

#三年熱賣前10大商品

product_year=data.frame(year=format(M_8$order_purchase_timestamp,"%Y"),product=M_8$product_category_name_english)
product2016=subset(product_year,year==2016)
table16=table(product2016$product)
product2017=subset(product_year,year==2017)
table17=table(product2017$product)
product2018=subset(product_year,year==2018)
table18=table(product2018$product)
table16=sort(table16,decreasing=TRUE) %>% head(10) #2016年銷售前10名
table17=sort(table17,decreasing=TRUE) %>% head(10) #2017年銷售前10名

pie(table16[1:5],main="2016")

pie(table17[1:5],main="2017")

pie(table18[1:5],main="2018")

#運費佔價格比例與評分的關聯

#運費佔價格比例與評分的關聯,圖形上看起來,運費佔比越高,消費者對商品評價越低。但運費佔比與評價的相關係數很小
M_15_4<-M_15_2[,-5]
M_15_5=mutate(M_15_4,fdividep=freight_value/price)
ggplot(M_15_5, aes(fdividep, review_score))+  
geom_smooth(se = FALSE) 
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

model=lm(M_15_5$fdivide~M_15_5$review_score) 
summary(model)

Call:
lm(formula = M_15_5$fdivide ~ M_15_5$review_score)

Residuals:
   Min     1Q Median     3Q    Max 
-0.349 -0.186 -0.089  0.073 25.886 

Coefficients:
                     Estimate Std. Error t value            Pr(>|t|)    
(Intercept)          0.358286   0.003151   113.7 <0.0000000000000002 ***
M_15_5$review_score -0.009300   0.000741   -12.6 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.349 on 113320 degrees of freedom
Multiple R-squared:  0.00139,   Adjusted R-squared:  0.00138 
F-statistic:  158 on 1 and 113320 DF,  p-value: <0.0000000000000002
cor(M_15_5$fdivide,M_15_5$review_score)
[1] -0.037256
model=lm(M_15_5$freight_value~M_15_5$review_score) #運費高低與評價的相關係數也很小
summary(model)

Call:
lm(formula = M_15_5$freight_value ~ M_15_5$review_score)

Residuals:
   Min     1Q Median     3Q    Max 
 -21.2   -6.8   -3.6    1.2  390.1 

Coefficients:
                    Estimate Std. Error t value            Pr(>|t|)    
(Intercept)          21.6094     0.1423   151.8 <0.0000000000000002 ***
M_15_5$review_score  -0.4059     0.0335   -12.1 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 15.8 on 113320 degrees of freedom
Multiple R-squared:  0.0013,    Adjusted R-squared:  0.00129 
F-statistic:  147 on 1 and 113320 DF,  p-value: <0.0000000000000002
cor(M_15_5$freight_value,M_15_5$review_score)
[1] -0.036006
summarise(filter(M_15_5, fdividep < 0.1 ),n())/112650      
summarise(filter(filter(M_15_5, fdividep < 0.2), fdividep >= 0.1),n())/112650  
summarise(filter(filter(M_15_5, fdividep < 0.3), fdividep >= 0.2),n())/112650 
summarise(filter(filter(M_15_5, fdividep < 0.4), fdividep >= 0.3),n())/112650  
summarise(filter(M_15_5, fdividep >= 0.4),n())/112650      
hist(M_15_5$fdividep,las=2,freq=T,xlab="運費佔比",ylab="數量",main="運費佔比的數量")

#商品價格和評分的關係

#商品價格和評分高低的關係。從圖型來看,大約超過3000元的商品,越貴評分越高越貴的商品評分越高。但商品價格和評分高低相關係數相當低,可推測兩者並無太大關連
plot(M_15_1$price,M_15_1$review_score,main="", ylab="評分", xlab="價錢")

ggplot(M_15_1, aes(price, review_score))+
geom_smooth(se = FALSE) 
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

model=lm(M_15_1$price~M_15_1$review_score) 
summary(model)

Call:
lm(formula = M_15_1$price ~ M_15_1$review_score)

Residuals:
   Min     1Q Median     3Q    Max 
  -121    -81    -46     14   6615 

Coefficients:
                    Estimate Std. Error t value            Pr(>|t|)    
(Intercept)          122.860      1.654   74.30 <0.0000000000000002 ***
M_15_1$review_score   -0.592      0.389   -1.52                0.13    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 183 on 113320 degrees of freedom
Multiple R-squared:  2.05e-05,  Adjusted R-squared:  1.17e-05 
F-statistic: 2.32 on 1 and 113320 DF,  p-value: 0.128
cor(M_15_1$price,M_15_1$review_score)
[1] -0.0045252

#不同月分的商品獲得5分評價的比例

#不同月分的商品獲得5分評價的比例 : 可見7、8月商品評價較高
t=as.Date(reviews$review_creation_date)
score5<-filter(reviews, review_score ==5 )
ts5=as.Date(score5$review_creation_date)
table(format(ts5,'%m')) %>% sort()

  09   10   11   01   02   12   04   03   07   06   05   08 
2522 2734 2813 3672 4291 4294 5072 5183 5747 6118 6623 8351 
ratio5<-table(format(ts5,'%m'))/table(format(t,'%m'))
ratio5

     01      02      03      04      05      06      07      08      09      10 
0.56675 0.56520 0.49995 0.53860 0.58887 0.59857 0.62447 0.61536 0.59425 0.59037 
     11      12 
0.57105 0.52999